#!/bin/bash
#
# Downloads the pretraining corpora from ModelScope and converts them into a
# cached dataset with `swift export`.
#
# Environment:
#   MODELSCOPE_CACHE  Root of the ModelScope cache; defaults to
#                     "$HOME/.cache/modelscope/hub" when unset or empty.

# Fail fast: abort on any command error, on use of an unset variable, and on a
# failure anywhere in a pipeline. Without this a failed login/download would be
# ignored and the export step would run against missing or partial data.
set -euo pipefail

# `:-` keeps the lookup safe under `set -u` when the variable is unset.
export MODELSCOPE_CACHE="${MODELSCOPE_CACHE:-$HOME/.cache/modelscope/hub}"

# Authenticate against ModelScope and fetch the source datasets.
#
# The access token is read from the environment instead of being committed to
# the repository. SECURITY: the token that used to be hard-coded on this line
# has been exposed in source control and should be revoked and re-issued.
# NOTE(review): the token is still passed on the command line and is therefore
# visible in the process list while `modelscope login` runs.
: "${MODELSCOPE_API_TOKEN:?MODELSCOPE_API_TOKEN must be set to a ModelScope access token}"

# Each step aborts the script on failure (bare `exit` propagates the failing
# command's status) so the later export never runs on incomplete data.
modelscope login --token "${MODELSCOPE_API_TOKEN}" || exit
modelscope download --dataset 'lfz233002072/ebidding-laws-pt' || exit
# Only the 'chinese/high*' subset of each IndustryCorpus2 dataset is fetched.
modelscope download --dataset BAAI/IndustryCorpus2_real_estate_construction --include 'chinese/high*' || exit
modelscope download --dataset BAAI/IndustryCorpus2_mathematics_statistics --include 'chinese/high*' || exit

# Build the cached pretraining dataset from the downloaded corpora.
# The result is written to "${OUTPUT_DIR}/pretrain_cached_dataset".
OUTPUT_DIR='/app/data/output/real_estate_construction-0__1B'
export OUTPUT_DIR

# Locations of the downloaded datasets inside the ModelScope cache.
datasets_root="${MODELSCOPE_CACHE}/datasets"
industry_prefix="${datasets_root}/BAAI/IndustryCorpus2"
laws_dir="${datasets_root}/lfz233002072/ebidding-laws-pt"

# Every source passed to --dataset, in the order they are handed to swift.
pretrain_sources=(
    "${industry_prefix}_real_estate_construction/chinese/high"
    "${industry_prefix}_mathematics_statistics/chinese/high"
    "${laws_dir}/cebpubservice_20251112_nation.jsonl"
    "${laws_dir}/cebpubservice_20251112_420000.jsonl"
    "${laws_dir}/cebpubservice_20251112_420200.jsonl"
    "${laws_dir}/pt_dataset_paragraph.jsonl"
)

swift export \
    --model /root/.cache/pminfo/AiLab-0.1B \
    --dataset "${pretrain_sources[@]}" \
    --dataset_num_proc 64 \
    --to_cached_dataset true \
    --split_dataset_ratio 0.01 \
    --use_chat_template false \
    --loss_scale all \
    --output_dir "${OUTPUT_DIR}/pretrain_cached_dataset"
